Tabula Sapiens Analyses : RF feature reduction

# I M P O R T
library(stringr)
library (plyr) 
library(ggplot2)
library(viridis)
library(UpSetR)
library(ggrepel)
library(factoextra)
library(FactoMineR)
library(viridis)
library(RColorBrewer)
library(dplyr)
library(pheatmap)
library(DT) 
library(cluster)
library(grid)
# V A R I A B L E S
#main.dir = "/run/user/1001/gvfs/sftp:host=curta.mcia.fr,user=acolajanni/gpfs/home/acolajanni"
path = "/run/user/1001/gvfs/sftp:host=core.cluster.france-bioinformatique.fr,user=acolajanni/shared/projects/microbiome_translocation"
local_path = "/home/acolajanni/Documents/work/"
#setwd(main.dir)

#path = "/home/acolajanni/Documents/work/"
data_dir = file.path(path, "data/Tabula_sapiens_immune_all")

result_dir = file.path(path, "results/Tabula_sapiens/RF_filter_50k/")
#setwd(path)

git_dir = file.path(local_path,"GITHUB/Signature_matrix_TabulaSapiens/")
git_geneset = file.path(git_dir,"genesets/")
################################################################################
# F U N C T I O N S

create_dir = function(directory){
  if ( ! file.exists(directory)){
    dir.create(directory)  }
  return(directory) }

replace_label = function(label, name_mapping){
  
  label_names = names(name_mapping)
  label = ifelse(str_detect(label, label_names[1]),name_mapping[1],label)
  label = ifelse(str_detect(label,label_names[2]),name_mapping[2],label)
  label = ifelse(label %in% c(name_mapping[1], name_mapping[2]), yes = label, no = name_mapping[3])
  #label=as.numeric(label)
  return(label) }


confidence_interval = function(perm_df, n_perm = 20) {
  t.score=qt(p=0.025, df=n_perm-1 ,lower.tail = FALSE)
  perm_df$se = perm_df$std_importance/sqrt(n_perm-1)
  perm_df$margin_error = t.score*perm_df$se
  perm_df$binf = perm_df$mean_importance - perm_df$margin_error
  perm_df$bsup = perm_df$mean_importance + perm_df$margin_error
  
  return(perm_df) }


importance_barplot = function(permutation_df,permutation_number=50, filter="mean", title="Mean feature importance barplot",
                              conf_interval = TRUE){
  if (conf_interval){
    permutation_df = confidence_interval(permutation_df, permutation_number) 
  }
  else {
    permutation_df$binf = permutation_df$mean_importance
    permutation_df$bsup = permutation_df$mean_importance
  }
  non_null_feature = length(permutation_df[permutation_df$mean_importance != 0 , ]$mean_importance)
  pos_feature = length(permutation_df[permutation_df$mean_importance > 0 , ]$mean_importance)

  
  if (filter == "mean") { 
    permutation_df = permutation_df[permutation_df$mean_importance > 0 , ]
    subtitle = paste(non_null_feature, "Genes with non null feature importance among which", pos_feature, "have a positive importance") }
  else {  
    sig_pos_feature = length(permutation_df[permutation_df$binf > 0 , ]$mean_importance)
    subtitle = paste(non_null_feature, "Genes with non null feature importance among which", pos_feature, "have a positive importance and \n ",
                     sig_pos_feature, "are significantly different than zero.")
    
    if (filter == "bsup") { permutation_df = permutation_df[permutation_df$bsup > 0 , ] }
    else if (filter == "binf") { permutation_df = permutation_df[permutation_df$binf > 0 , ] }
    else if (filter == "zero") { permutation_df = permutation_df[permutation_df$mean_importance != 0 , ] }
  }
  
  
  permutation_df$genes = row.names(permutation_df)
  n_gene = length(row.names(permutation_df))
  ordered_permutation_df = permutation_df[order(permutation_df$mean_importance, decreasing = TRUE),]
  
  plot = ggplot( permutation_df, aes(x = reorder(genes,mean_importance), y = mean_importance, fill=mean_importance ) )  +
    geom_col(position = position_dodge(0), width = 0.75) + 
    geom_errorbar(aes(ymin=binf, ymax=bsup ),
                size=.3,width=.2, position=position_dodge(.9)) +
    geom_hline(aes(yintercept = 0), alpha=0.20) +
    coord_flip() +
    labs(fill = "Cluster") +
    ylab('Mean Permutation Feature Importance') + xlab('Gene name')+
    ggtitle(title, subtitle = subtitle ) +
  
    scale_fill_viridis(discrete = FALSE, alpha=0.65, direction = -1, option='C', name="Mean Feature Importance") +
    theme_linedraw()+  
    theme( plot.title = element_text(size=15),
           axis.text.y = element_text(size = 8))
  return(list("plot"=plot, 
              "ordered_permutation_dataframe" = ordered_permutation_df[,-c(1:permutation_number)],
              "permutation_dataframe"= permutation_df)) }


add_annotation_line_barplot_importance = function(plot, permutation_df, last_gene, annotation = "most important features", adjust_annotation = .25, adjust_annotationX=0.85){
  gene_value = permutation_df[permutation_df$genes == last_gene,]$mean_importance
  gene_rank = length(permutation_df[permutation_df$mean_importance >= gene_value,]$genes)
  
  plot = plot +
    geom_vline( xintercept = which(permutation_df$genes == last_gene)-0.5, linetype='dashed' )+
    annotate("text", x=which(permutation_df$genes == last_gene)+adjust_annotation , y=max(permutation_df$mean_importance)*adjust_annotationX,
             label= paste(gene_rank,annotation) )
    return(plot) }

RA_plot = function(FC_df, x, y, title = "RA plot", highlight=c('TOP20'), annotation = "Most Important Features", vjust = 2, force=3, y_scale = 0, add_label=TRUE){
  
  FC_df$selection = ifelse(FC_df[[x]] > quantile(FC_df[[x]],0.75) & abs(FC_df[[y]]) > 1 , 
                            yes = "Selected",no = "Not selected")
  selected_number= length(FC_df$selection[FC_df$selection == "Selected"])
    
  if (highlight[1] == "TOP20") {
    FC_df$selection = ifelse(abs(FC_df[[y]]) > sort(abs(FC_df[[y]]),TRUE )[21]  , 
                            yes = "20 Highest FC",no = FC_df$selection)
    FC_df$label = ifelse(FC_df$selection == "20 Highest FC", 
                          yes=row.names(FC_df), no="") }
  else{
    FC_df$selection = ifelse(row.names(FC_df) %in% highlight , 
                            yes = annotation, no = FC_df$selection)
    FC_df$label = ifelse(FC_df$selection == annotation, 
                          yes=row.names(FC_df), no="") }
  
  
  maximum_y = max(abs(FC_df[[y]])) + y_scale
  title =paste( c(title, "\n Gene selected :",selected_number),collapse = " " )
  
  plot = ggplot(FC_df, aes(x=FC_df[[x]], y=FC_df[[y]]))+
    geom_point(aes(color=selection, alpha=selection, size = selection))+
    geom_smooth(color="darkred",size=0.5)+
    annotate("text", x=2.4, y=-1-0.5, label="log2 FC = -1", colour = "darkblue") +
    geom_hline(yintercept = -1, color='darkblue',linetype='dashed') +
    annotate("text", x=2.4, y= 1+0.5, label="log2 FC = 1", colour = "darkblue") +
    coord_cartesian(ylim=c(-maximum_y,maximum_y))+
    geom_hline(yintercept = 1, color='darkblue',linetype='dashed') +
    geom_vline(xintercept = quantile(FC_df[[x]],0.75),linetype='dashed') +
    annotate("text", y=-5.5, x= quantile(FC_df[[x]],0.75)/4, 
           label="3rd quartile \n of mean expression", colour = "black") +
    
    scale_color_manual(values = c("orangered","#777777","#00539CFF"))+
    scale_alpha_manual(values=c(1,0.25, 0.75), guide='none')+
    scale_size_manual(values=c(2,0.5,1), guide='none')+
    theme_linedraw()+
    theme(legend.position = c(0.85, 0.15) ,legend.direction = 'vertical') +
    labs(color="Features", )+
    xlab("Average log expression") +
    ylab("log ratio of expression") +
    ggtitle(title)
  
  if (add_label){
    plot = plot +
      geom_text_repel(data=FC_df, label = FC_df$label,
                    vjust = vjust, force=force, max.overlaps = 100) }
  
  return(list("plot"=plot, "df"=FC_df)) }


visualize_annotation = function(Annotation_table, celltype){
  Freq_annot = as.data.frame(table(Annotation_table$goslim_goa_description))

  plot = ggplot(Freq_annot, aes(x = reorder(Var1,Freq), y = Freq, fill=Freq)) + 
          geom_col() + 
          coord_flip() +
          ggtitle(paste("Frequence of annotation terms for", celltype ,"geneset"))+
          xlab('GOSLIM GOA annotation terms') + ylab('Frequence')+ labs(fill="Frequence") +
          theme_linedraw() +
          theme(axis.text = element_text(size=12, face = "bold"))
  
  df = rmarkdown::paged_table(unique(Annotation_table[,c("hgnc_symbol","entrezgene_description")])  ) %>% datatable 
  
  return(list("df" = df, "plot"=plot)) }

geneset_to_binary_df = function(geneset_list){
  geneset_list_df = lapply(geneset_list, function(x) as.data.frame(x))
  tmp = ldply(geneset_list_df)
  tmp = reshape2::dcast(tmp, x ~ .id)
  row.names(tmp) = tmp$x
  tmp$x = NULL
  tmp = ifelse(is.na(tmp), 0,1)   
  return( as.data.frame(tmp) ) }

heatmap_from_geneset_list = function(geneset_list, tabula_column, title = " ", 
                                     min_representation = 2, cutree_rows = 1,
                                     clust=TRUE){
  
  n_cut = length(names(geneset_list))
  all_tabula = geneset_list[[tabula_column]]
  all_signature = unique(unlist(geneset_list[ names(geneset_list) != tabula_column ]))
  
  unique_tabula = all_tabula[! all_tabula %in% all_signature]
  Full_geneset = unique(unlist(geneset_list))
  geneset_list[["all"]] = Full_geneset
  df_heatmap = fromList(geneset_list)
  row.names(df_heatmap) = geneset_list$all 
  
  main = paste0(title , "\n", 
        "Presented genes appears in at least ", min_representation, 
        " signature matrix, or in Tabula Sapiens \n",
        length(unique_tabula), 
        " Features uniquely predicted in Tabula sapiens geneset " , 
        "\n 1 : Detected by signature matrix",
        "\n 0 : Not detected by signature matrix")
  
  if (! clust) {return(list("title"=main, "df" = df_heatmap))}
  
  pheatmap::pheatmap(df_heatmap[ 
    rowSums(df_heatmap) > min_representation | df_heatmap[[tabula_column]] == 1 , 
    colnames(df_heatmap) != "all"],
      clustering_distance_rows = "correlation", clustering_distance_cols = "correlation",
      cutree_cols = n_cut, cutree_rows = cutree_rows,
      color = c("#555555","red"), 
      na_col="lightgray",breaks = c(0, .5 , 1),
      fontsize_row = 14, fontsize_col = 15, angle_col = 0, legend_breaks = c(0,1),
      main = main )
}

FC_prop_expr = function(df, n_cell, max_cell){
  
  columns = colnames(df)[str_detect(colnames(df), "prop_expr")]
  A = columns[!str_detect(columns, "vs")]
  B = columns[ str_detect(columns, paste0("other")) ]

  df[[paste0("FC_",A)]] = compute_FC( df[[A]] , df[[B]] , 0)
  return(df) }

kneedle_threshold = function(df, y, sens = 1, min = 0){
  df$value = abs(as.numeric(y))
  df = df[order(df$value, decreasing = TRUE),]
  df$rank = 1:length(y)
  elbow = kneedle::kneedle(x = df$rank, y = df$value, decreasing = TRUE, concave = FALSE, sensitivity = sens)
  
  plot = ggplot(df, aes(x=rank,y=value))+
    geom_line()+
    geom_hline(yintercept = elbow[2], color="blue",alpha=0.5) + geom_vline(xintercept = elbow[1], color="blue",alpha=0.5)+
    ylab("y value") + xlab("Gene Rank")
  
  if (elbow[2] < min) {elbow[2] = min}
  
  return(list("rank" = elbow[1], "threshold" = elbow[2], "plot"=plot )) }

Rprop_plot = function(FC_df, FC_x, y, title = "plot", highlight=c('TOP20'), annotation = "Most Important Features", 
                      vjust = 2, force=3, x_scale = 0, add_label=TRUE, 
                      FC_threshold = 1, prop_threshold=0.25, FC_thresh2 = 1,
                      prop_threshold_to_show = prop_threshold) {
  
  FC_df$selection = ifelse(abs(FC_df[[FC_x]]) > FC_threshold & abs(FC_df[[y]]) > prop_threshold , 
                            yes = "Selected",no = "Not selected")
  Display_2nd_annotation = FALSE
  
  if (FC_threshold <= FC_thresh2){
    FC_df$selection = ifelse(abs(FC_df[[FC_x]]) >= FC_thresh2, yes = "Selected", no = FC_df$selection ) 
    Display_2nd_annotation = TRUE }
  
  selected_number= length(FC_df$selection[FC_df$selection == "Selected"])
    
  if (highlight[1] == "TOP20") {
    
    tmp_df = FC_df[FC_df[[y]] > prop_threshold &  abs(FC_df[[FC_x]]) > FC_threshold , ]
    tmp_df = tmp_df[with(tmp_df, order(abs(tmp_df[[FC_x]]) , decreasing = TRUE)),]
     if ( length(row.names(tmp_df)) > 20) { selected = row.names(tmp_df)[1:20] } 
     else{  selected = row.names(tmp_df) }

    FC_df$selection = ifelse( row.names(FC_df) %in% selected, 
                            yes = "20 Highest FC expressed in most cells",no = FC_df$selection)
    FC_df$label = ifelse(FC_df$selection == "20 Highest FC expressed in most cells", 
                          yes=row.names(FC_df), no="") }
  
  if(highlight[1] == "selection"){
    FC_df$label = ifelse(FC_df$selection == "Selected", 
                          yes=row.names(FC_df), no="") } 
  else{
    FC_df$selection = ifelse(row.names(FC_df) %in% highlight , 
                            yes = annotation, no = FC_df$selection)
    FC_df$label = ifelse(FC_df$selection == annotation, 
                          yes=row.names(FC_df), no="") }
  
  
  maximum_x = max(abs(FC_df[[FC_x]])) + x_scale
  title =paste( c(title, "\n Gene selected :",selected_number),collapse = " " )
  
  plot = ggplot(FC_df, aes(x=FC_df[[FC_x]], y=FC_df[[y]]))+
    geom_point(aes(color=selection, alpha=selection, size = selection))+

    # Annotation of first filter
    annotate("text", y=-0.02, x=-FC_threshold-0.75, 
             label=paste0("log2 FC = -",FC_threshold), 
             colour = "darkblue") +
    geom_vline(xintercept = -FC_threshold, color='darkblue',linetype='dashed') +
    
    annotate("text", y=-0.02, x= FC_threshold+0.75, 
             label=paste0("log2 FC = ",FC_threshold), 
             colour = "darkblue") +
    geom_vline(xintercept = FC_threshold, color='darkblue',linetype='dashed') +
    # Proportion line
    geom_hline(yintercept = prop_threshold, color='darkblue',linetype='dashed') +

    
    coord_cartesian(xlim=c(-maximum_x,maximum_x), ylim=c(-0.025,1)) +
    
    scale_color_manual(values = c("orangered","#777777","#00539CFF"))+
    scale_alpha_manual(values=c(1,0.25, 0.75), guide='none')+
    scale_size_manual(values=c(2,0.5,1), guide='none')+
      
    theme_linedraw()+
    theme(legend.position = c(0.15, 0.85) ,legend.direction = 'vertical') +
    labs(color="Features")+
    ylab("Proportion of cells that express a gene") +
    xlab("log ratio of expression") +
    ggtitle(title)
    
  if (add_label){
    plot = plot +
      geom_text_repel(data=FC_df, label = FC_df$label,
                    vjust = vjust, force=force, max.overlaps = 100) }
      # Annotation of 2nd filter
  if (Display_2nd_annotation){
    plot = plot +
    annotate("text", y=-0.04, x=-FC_thresh2-1, 
             label=paste0("log2 FC = -",round(FC_thresh2, digits = 3)), 
             colour = '#555555') +
    geom_vline(xintercept = -FC_thresh2, color='#555555',linetype='dashed') +
    
    annotate("text", y=-0.04, x= FC_thresh2+1, 
             label=paste0("log2 FC = ",label=round(FC_thresh2, digits = 3)  ), 
             colour = '#555555') +
    geom_vline(xintercept = FC_thresh2, color='#555555',linetype='dashed')  }
  
  if (prop_threshold < 0.1) {
      plot = plot + annotate("text", y=0.05, x= maximum_x*0.75, 
             label=paste0("Proportion of \n cells expressing = ", prop_threshold_to_show), 
             colour = "darkblue") }
  else {
      plot = plot + annotate("text", y=prop_threshold*1.15, x= -maximum_x*0.75, 
             label=paste0("Proportion of cells expressing = ", round(prop_threshold, 3)), 
             colour = "darkblue") }
  
  return(list("plot"=plot, "df"=FC_df)) }


get_permutation_list = function(directory,pattern2,pattern1="Permutation_result_"){
  # Import files
  #directory = file.path(directory)
  tmp = list.files(directory, full.names = TRUE, 
                   recursive = TRUE, pattern=pattern1)

  if (pattern1 == "Permutation_result_") {
    Permutation_list = lapply(tmp, 
                            function(x) x = read.csv2(x, header = TRUE, 
                                            sep = ",", row.names = 1))}
  else {
    Permutation_list = lapply(tmp, function(x) {
      x = read.table(x, header = TRUE , sep=",", row.names=2) } )} 
  
  names(Permutation_list) = tmp
  # Rename each dataframe
  names(Permutation_list) = str_remove_all(
    string = names(Permutation_list) , 
    pattern = paste0(directory , pattern2))
  names(Permutation_list) = str_remove_all(string = names(Permutation_list) , 
                                           pattern =".csv")
  # Change data to numeric
  Permutation_list = lapply(Permutation_list, function(df) {
    df = dplyr::mutate_all(df, function(x) as.numeric(as.character(x) )) } )
  
  # return(list('l'=0,
  #             "dir"=paste0(directory , pattern2),
  #             "names"=names(Permutation_list) ) ) }
  return(Permutation_list)  }


get_genes_from_binary=function(binary_df, expressed_genes=NULL, filter_expressed = FALSE ){
  cell_types = colnames(binary_df)
  cell_geneslist = lapply(cell_types, function(cell){
    #gene_list = list()
    gene_list = row.names(binary_df[ binary_df[[cell]] != 0 , ])
    gene_list = gene_list[gene_list %in% expressed_genes]
    return(gene_list) })
  
  names(cell_geneslist) = cell_types  
  return(cell_geneslist) }


compute_median_expr=function(df,col_to_keep){
  df = df[,col_to_keep]
  df$genes = row.names(df)
  df_melt = reshape2::melt(df)
  return (median(df_melt$value)) }

compute_Avalue=function(expr_A, expr_B){ 
  return(0.5* ( log2(expr_A+1) + log2(expr_B+1) ) ) }
  #return(0.5* ( expr_A + expr_B ) ) }

compute_FC=function(expr_A, expr_B, median_expr){ 
  return(log2( (expr_A+median_expr ) / (expr_B+median_expr) )) }
  #return( (expr_A+median_expr ) / (expr_B+median_expr) ) } 

Get_value_RA = function(df, median_expr){
  A = colnames(df)[!str_detect(colnames(df), "vs|prop_expr")]
  B = colnames(df)[ str_detect(colnames(df), paste0("other_vs_",A)) ]
  B = B[! str_detect( B, "prop_expr")  ]
  df[[paste0("FC_",A)]] = compute_FC( df[[A]] , df[[B]] ,median_expr)
  df[[paste0("Avalue_",A)]] = compute_Avalue( df[[A]] , df[[B]] )
  return(df)
}



build_confusion_heatmap = function(conf_mat,cell_order, 
                                   filter = FALSE, 
                                   column_filter="signature_matrix",
                                   value_filter = "LM22",
                                   direction_color_palette = -1){
  cells_order = as.factor(cell_order)
  cells_order_Y = as.factor(rev(cell_order))
  
  if (direction_color_palette > 0){
    color1 = "white"
    color2 = "black" 
  } else {
    color1 = "black"
    color2 = "white" 
  }
  
  conf_mat$Percentage = round(conf_mat$Percentage*100,2)
  conf_mat$Percentage_show = paste0(as.character(conf_mat$Percentage),"%")
  
  if (filter){ conf_mat = conf_mat[conf_mat[[column_filter]] == value_filter , ] } 
  
  plot = ggplot(data = conf_mat, 
                aes(x=factor(Predicted_label, levels = cells_order), 
                            y=factor(True_label, levels = cells_order_Y), 
                            fill=Percentage)) + 
  geom_tile(color="white")+
  # % cells predicted
  geom_text(data = conf_mat[conf_mat$Percentage > 30,],
    aes(label = Percentage_show), color = color2, size = 3, vjust=-1) +
  geom_text(data = conf_mat[conf_mat$Percentage <= 30,],
    aes(label = Percentage_show), color = color1, size = 3, vjust=-1) +
  
  # Number of cells predicted 
  geom_text(data = conf_mat[conf_mat$Percentage > 30,],
    aes(label = value), color = color2, size = 3, vjust=1) +
  geom_text(data = conf_mat[conf_mat$Percentage <= 30,],
    aes(label = value), color = color1, size = 3, vjust=1) +
  
  xlab("Predicted labels") + ylab("True labels")+
  theme_minimal()+
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))+
  scale_fill_viridis(option = "rocket", direction = direction_color_palette)
  
  return(list("plot"=plot, "conf_mat"=conf_mat)) } 

get_confusion = function(directory,pattern2,pattern1="confusion_heatmap_", 
                         cells ){
  # Import files
  tmp = list.files(directory, full.names = TRUE, 
                   recursive = TRUE, pattern=pattern1)

  conf_heamtap = read.csv2(tmp, header = FALSE, sep = " ", row.names = NULL)
  colnames(conf_heamtap) = cells
  row.names(conf_heamtap) = cells
  conf_mat_percent = conf_heamtap
  
  for (i in seq(1,nrow(conf_heamtap) )) {
    ncell = sum(conf_heamtap[i,])
    conf_mat_percent[i,] = conf_heamtap[i,] / ncell }
  
  colnames(conf_mat_percent) = cells
  row.names(conf_mat_percent) = cells
  conf_mat_percent$True_label = cells
  conf_mat_percent = reshape2::melt(conf_mat_percent)

  colnames(conf_heamtap) = cells
  row.names(conf_heamtap) = cells
  conf_heamtap$True_label = cells
  conf_heamtap = reshape2::melt(conf_heamtap)

  colnames(conf_mat_percent) = c("True_label","Predicted_label","Percentage")
  colnames(conf_heamtap) = c("True_label","Predicted_label","value")
  conf_mat = merge(conf_mat_percent,conf_heamtap)
  
  return(conf_mat)  }
confusion_matrix_setup = function(directory,pattern2,
                                  cells, signature_matrix,
                                  pattern1="confusion_heatmap_"){
  cf = get_confusion(
    directory = directory ,
    pattern2= "/confusion_heatmap_",
    cells = cells)
    
  balanced_acc = round(mean(cf[cf$True_label == cf$Predicted_label,]$Percentage)*100,2)
  cf$subtitle=paste0(signature_matrix, "\n Balanced accuracy = ",balanced_acc,"%")
  cf$signature_matrix = signature_matrix
  return(cf) }

merge_confusion_matrix = function(pattern2, tbs_path,lm22_path,til10_path,cells){
  cf1 = confusion_matrix_setup(
    directory = tbs_path,
    pattern2= "/confusion_heatmap_",
    cells = cells,
    signature_matrix = "Tabula Sapiens")
  cf2 = confusion_matrix_setup(
    directory = lm22_path,
    pattern2= "/confusion_heatmap_",
    cells = cells,
    signature_matrix = "LM22")
  
  cf3 = confusion_matrix_setup(
    directory = til10_path,
    pattern2= "/confusion_heatmap_",
    cells = cells,
    signature_matrix = "TIL10")
  return(rbind(cf1,cf2,cf3)) }

Processing of signature matrix

Cell Types

Filtering genes - Cell vs Cell

CD4 vs CD8

61 Features selected

3435 features

NK vs CD8

79 Features selected

1028 features

247 features

Naive vs Memory Bcells

42 Features selected

3582 features

Filtering genes - Cell vs all

NK vs other

178 Features selected

926 features

Performances

CD8 vs other

79 Features selected

639 features

Performances

CD4 vs other

114 Features selected

1402 features

319 features

Performances

Naive Bcell vs other

29 Features selected

6098 features

Performances

Memory Bcell vs other

53 Features selected

7208 features

Performances

Neutrophils vs other

84 Features selected

825 features

Performances - No neutrophils in Liu2021 data

Plasmocyte vs other

19 Features selected

5172 features

Performances - No Plasmocytes in Liu2021 data / TIL10

Monocyte vs other

174 Features selected

4022 features

1870 features

369 features

Performances

Macrophage vs other

68 Features selected

369 features

Performances - No Macrophages in Liu2021 data

Platelet vs other

40 Features selected

510 features

Performances - Platelet unique to Tabula Sapiens

Combining Features

## $NK_cell

## 
## $Neutrophil

## 
## $Monocyte

## 
## $Plasmocyte

## 
## $CD4

## 
## $CD8

## 
## $Macrophage

## 
## $memory_B_cell

## 
## $naive_B_cell

Performances

Mean Expression heatmap

ROC curve